Setup
knitr::opts_chunk$set(echo = TRUE)
source("RaceID2_StemID_class.R")
## install required packages (only at first time)
install.packages(c("tsne","pheatmap","MASS","cluster","mclust","flexmix","lattice","fpc","RColorBrewer","permute","amap","locfit","vegan"), repos = "http://cran.us.r-project.org")
## Error in install.packages : Updating loaded packages
## input data
x <- read.csv("D:/Documents/SCHOOL/VU/2017-2018 Master Year 2/Project/Seperate Scripts/lymphnode-sc-transcriptomics/data/3 - combinedcounts/LNS_W_ALL.csv",sep=",",header=TRUE)
rownames(x) <- x$GENEID
# prdata: data.frame with transcript counts for all genes (rows) in all cells (columns); with rownames == gene ids; remove ERCC spike-ins
prdata <- x[grep("ERCC",rownames(x),invert=TRUE),-1]
RaceID Script
## RaceID2
# initialize SCseq object with transcript counts
sc <- SCseq(prdata)
# filtering of expression data
sc <- filterdata(sc, mintotal=1500, minexpr=5, minnumber=1, maxexpr=500, downsample=FALSE, dsn=1, rseed=17000)
# k-medoids clustering
sc <- clustexp(sc,clustnr=30,bootnr=50,metric="pearson",do.gap=FALSE,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids")
## Clustering k = 1,2,..., K.max (= 30): .. done
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
# compute t-SNE map
sc <- comptsne(sc,rseed=15555)
## sigma summary: Min. : 0.0895715709120785 |1st Qu. : 0.126044344689149 |Median : 0.140730395072168 |Mean : 0.147191667902368 |3rd Qu. : 0.160472081150269 |Max. : 0.315411608654121 |
## Epoch: Iteration #100 error is: 1.49199896725629
## Epoch: Iteration #200 error is: 1.38758770536514
## Epoch: Iteration #300 error is: 1.3292023285217
## Epoch: Iteration #400 error is: 1.29767623172042
## Epoch: Iteration #500 error is: 1.28299126515621
## Epoch: Iteration #600 error is: 1.27481716272011
## Epoch: Iteration #700 error is: 1.27009746271147
## Epoch: Iteration #800 error is: 1.26698531272644
## Epoch: Iteration #900 error is: 1.26492125272787
## Epoch: Iteration #1000 error is: 1.26354699253662
# detect outliers and redefine clusters
sc <- findoutliers(sc, outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.95)
## [1] 5.000000e-01 2.500000e-01 1.250000e-01 6.250000e-02 3.125000e-02
## [6] 1.562500e-02 7.812500e-03 3.906250e-03 1.953125e-03 9.765625e-04
## [11] 4.882812e-04 2.441406e-04 1.220703e-04 6.103516e-05 3.051758e-05
## [16] 1.525879e-05 7.629395e-06 3.814697e-06 1.907349e-06 9.536743e-07
## [21] 4.768372e-07 2.384186e-07 1.192093e-07 5.960464e-08 2.980232e-08
## [26] 1.490116e-08 7.450581e-09 3.725290e-09 1.862645e-09 9.313226e-10
## [31] 4.656613e-10 2.328306e-10 1.164153e-10 5.820766e-11 2.910383e-11
## [36] 1.455192e-11 7.275958e-12 3.637979e-12 1.818989e-12 9.094947e-13
RaceID Diagnostic Plots
## diagnostic plots
# gap statistics: only if do.gap == TRUE
##plotgap(sc)
# plot within-cluster dispersion as a function of the cluster number: only if sat == TRUE
plotsaturation(sc,disp=TRUE)

# plot change of the within-cluster dispersion as a function of the cluster number: only if sat == TRUE
plotsaturation(sc)

# silhouette of k-medoids clusters
plotsilhouette(sc)

# Jaccard's similarity of k-medoids clusters
plotjaccard(sc)

# barchart of outlier probabilities
plotoutlierprobs(sc)

# regression of background model
plotbackground(sc)

# dependence of outlier number on probability threshold (probthr)
plotsensitivity(sc)

# heatmap of k-medoids cluster
clustheatmap(sc,final=FALSE,hmethod="single")

## [1] 2 1 9 8 11 3 7 5 12 10 4 6
# heatmap of final cluster
clustheatmap(sc,final=TRUE,hmethod="single")

## [1] 44 41 47 39 37 23 42 26 36 18 48 49 40 33 21 25 28 35 17 30 38 27 31
## [24] 13 29 24 4 32 20 46 8 43 11 12 16 3 1 7 14 6 10 34 9 19 22 5
## [47] 15 2 45
# highlight k-medoids clusters in t-SNE map
plottsne(sc,final=FALSE)

# highlight final clusters in t-SNE map
plottsne(sc,final=TRUE)

# highlight cell labels in t-SNE map
plotlabelstsne(sc,labels=sub("(\\_\\d+)","",names(sc@ndata)))

# highlight groups of cells by symbols in t-SNE map
plotsymbolstsne(sc,types=sub("(\\_\\d+)$","", names(sc@ndata)))

# highlight transcirpt counts of a set of genes in t-SNE map, e. g. all Apoa genes
#g <- c("Apoa1__chr9", "Apoa1bp__chr3", "Apoa2__chr1", "Apoa4__chr9", "Apoa5__chr9")
#plotexptsne(sc,g,n="Apoa genes",logsc=TRUE)
RaceID Results
## identification of marker genes
# differentially regulated genes in each cluster compared to the full ensemble
cdiff <- clustdiffgenes(sc,pvalue=.01)
setwd("D:/Documents/SCHOOL/VU/2017-2018 Master Year 2/Project/Seperate Scripts/lymphnode-sc-transcriptomics/data/4 - raceidstemid/All")
## write results to text files
# final clusters
x <- data.frame(CELLID=names(sc@cpart),cluster=sc@cpart)
write.table(x[order(x$cluster,decreasing=FALSE),],"cell_clust.xls",row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
# differentially expressed genes in cluster
for ( n in names(cdiff) ) write.table(data.frame(GENEID=rownames(cdiff[[n]]),cdiff[[n]]),paste(paste("cell_clust_diff_genes",sub("\\.","\\_",n),sep="_"),".xls",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
# differentially expressed genes between two sets of clusters, e. g. cluster 1 and clusters 2,3
d <- diffgenes(sc,cl1=1,cl2=c(2,3),mincount=5)
plotdiffgenes(d,gene=names(d$z)[1])

StemID Script
## StemID
# initialization
ltr <- Ltree(sc)
# computation of the entropy
ltr <- compentropy(ltr)
# computation of the projections for all cells
ltr <- projcells(ltr,cthr=2,nmode=FALSE)
# computation of the projections for all cells after randomization
ltr <- projback(ltr,pdishuf=100,nmode=FALSE,rseed=17000) # NOTE: TEMPORARILY CHANGED THIS TO 100 FOR TESTING PURPOSES, TAKES 10min, WAS 2000!
## pdishuffle: 1
## pdishuffle: 2
## pdishuffle: 3
## pdishuffle: 4
## pdishuffle: 5
## pdishuffle: 6
## pdishuffle: 7
## pdishuffle: 8
## pdishuffle: 9
## pdishuffle: 10
## pdishuffle: 11
## pdishuffle: 12
## pdishuffle: 13
## pdishuffle: 14
## pdishuffle: 15
## pdishuffle: 16
## pdishuffle: 17
## pdishuffle: 18
## pdishuffle: 19
## pdishuffle: 20
## pdishuffle: 21
## pdishuffle: 22
## pdishuffle: 23
## pdishuffle: 24
## pdishuffle: 25
## pdishuffle: 26
## pdishuffle: 27
## pdishuffle: 28
## pdishuffle: 29
## pdishuffle: 30
## pdishuffle: 31
## pdishuffle: 32
## pdishuffle: 33
## pdishuffle: 34
## pdishuffle: 35
## pdishuffle: 36
## pdishuffle: 37
## pdishuffle: 38
## pdishuffle: 39
## pdishuffle: 40
## pdishuffle: 41
## pdishuffle: 42
## pdishuffle: 43
## pdishuffle: 44
## pdishuffle: 45
## pdishuffle: 46
## pdishuffle: 47
## pdishuffle: 48
## pdishuffle: 49
## pdishuffle: 50
## pdishuffle: 51
## pdishuffle: 52
## pdishuffle: 53
## pdishuffle: 54
## pdishuffle: 55
## pdishuffle: 56
## pdishuffle: 57
## pdishuffle: 58
## pdishuffle: 59
## pdishuffle: 60
## pdishuffle: 61
## pdishuffle: 62
## pdishuffle: 63
## pdishuffle: 64
## pdishuffle: 65
## pdishuffle: 66
## pdishuffle: 67
## pdishuffle: 68
## pdishuffle: 69
## pdishuffle: 70
## pdishuffle: 71
## pdishuffle: 72
## pdishuffle: 73
## pdishuffle: 74
## pdishuffle: 75
## pdishuffle: 76
## pdishuffle: 77
## pdishuffle: 78
## pdishuffle: 79
## pdishuffle: 80
## pdishuffle: 81
## pdishuffle: 82
## pdishuffle: 83
## pdishuffle: 84
## pdishuffle: 85
## pdishuffle: 86
## pdishuffle: 87
## pdishuffle: 88
## pdishuffle: 89
## pdishuffle: 90
## pdishuffle: 91
## pdishuffle: 92
## pdishuffle: 93
## pdishuffle: 94
## pdishuffle: 95
## pdishuffle: 96
## pdishuffle: 97
## pdishuffle: 98
## pdishuffle: 99
## pdishuffle: 100
# assembly of the lineage tree
ltr <- lineagetree(ltr,pthr=0.01,nmode=FALSE)
# determination of significant differentiation trajectories
ltr <- comppvalue(ltr,pethr=0.01,nmode=FALSE)
StemID Diagnostic Plots
## diagnostic plots
# histogram of ratio between cell-to-cell distances in the embedded and the input space
plotdistanceratio(ltr)

# t-SNE map of the clusters with more than cthr cells including a minimum spanning tree for the cluster medoids
plotmap(ltr)

# visualization of the projections in t-SNE space overlayed with a minimum spanning tree connecting the cluster medoids
plotmapprojections(ltr)

# lineage tree showing the projections of all cells in t-SNE space
plottree(ltr,showCells=TRUE,nmode=FALSE,scthr=0)

# lineage tree without showing the projections of all cells
plottree(ltr,showCells=FALSE,nmode=FALSE,scthr=0)

# heatmap of the enrichment p-values for all inter-cluster links
plotlinkpv(ltr)

# heatmap of the link score for all inter-cluster links
plotlinkscore(ltr)

# heatmap showing the fold enrichment (or depletion) for significantly enriched or depleted links
projenrichment(ltr)

StemID Results
## extract projections onto all links for all cells in a given cluster i
x <- getproj(ltr,i=1)
# heatmap of all projections for cluster i
pheatmap(x$pr)

# heatmap of z-score for all projections for cluster i
pheatmap(x$prz)

## extracting all cells on two branches sharing the same cluster and computing differentially expressed genes between these two branches
x <- branchcells(ltr,list("1.3","1.2"))
# z-scores for differentially expressed genes
head(x$diffgenes$z)
## mt-Co3 Rps6 Gm28037 Eef1a1 Gm29055 Rpl28
## 8.535748 7.046961 6.069958 5.783058 5.607424 5.133549
# plotting the cells on the two branches as additional clusters in the t-SNE map
plottsne(x$scl)

## computing the StemID score
x <- compscore(ltr,nn=1)
#plotting the StemID score
plotscore(ltr,1)
